
/* 

Paper: Gentrification and pioneer businesses 
Authors: Behrens, Boulam, Martin, Mayneris 

Name dofile: prepare_variables4reg.do  
Version: 15 nov. 2021 

Outputs:
- id_conc_pop_income 
- naics_id_conc_database 


Inputs:

- id_conc_database.dta  (from create_db_concordedblocks.do)
- gentri_dum_id_conc.dta (from prepare_gentri_dum.do)
- naics02_NETSlabel.dta
- id_conc_neighbors_mapping.dta (GIS, publicly available)
- id_conc_disc_500m.dta (GIS, publicly available)
- id_conc_disc_250m.dta (GIS, publicly available)
- duns_1990_2010_idconc_naics.dta (authors' computation from NETS and GIS)
*/ 


set more off

// demographic changes blocks (our 3 measures of gentrification)

use id_conc_database.dta, clear

tsset id_conc t
g sh_edu=edu_college_block/pop if pop!=. 
rename year_census year
rename percap_income_block pc
keep id_conc pc sh_edu year pop dis_ws
sum pop if pop>0,d 
drop if pop<8 
reshape wide sh_edu pc pop, i(id_conc dis_ws) j(year)  

foreach i in 1990 2000 2010{
local z 10 // # of bins 
xtile pctile_pc`i'=pc`i', n(`z')
}

gen poor1990= pctile_pc1990<6&pctile_pc1990!=.
gen poor2000= pctile_pc2000<6&pctile_pc2000!=.
label var poor1990 "Below median block 1990"
label var poor2000 "Below median block 2000"


gen pc_gr1990_poor=(log(pc2000)-log(pc1990))*poor1990
replace pc_gr1990_poor=. if poor1990==0

gen pc_gr2000_poor=(log(pc2010)-log(pc2000))*poor2000
replace pc_gr2000_poor=. if poor2000==0

gen sh_edu_gr1990_poor=(sh_edu2000-sh_edu1990)*poor1990
replace sh_edu_gr1990_poor=. if poor1990==0

gen sh_edu_gr2000_poor=(sh_edu2010-sh_edu2000)*poor2000
replace sh_edu_gr2000_poor=. if poor2000==0

label var pc_gr1990_poor "Per cap. inc. growth 1990-2000 x Below median block 1990"
label var pc_gr2000_poor "Per cap. inc. growth 2000-2010 x Below median block 2000"

label var sh_edu_gr1990_poor "Share college educated growth 1990-2000 x Below median block 1990"
label var sh_edu_gr2000_poor "Share college educated growth 2000-2010 x Below median block 2000"

merge 1:1 id_conc using gentri_dum_id_conc.dta
drop if _m==2
drop _m

keep id_conc pop1990 pop2000 poor1990 poor2000  pc_gr1990_poor  pc_gr2000_poor sh_edu_gr1990_poor sh_edu_gr2000_poor gentri1990 gentri200

save poor_gr.dta, replace

// create variables of demographic changes within 250m rings 

use poor_gr.dta, clear
rename id_conc id_conc_disc250m
save temp, replace 

use id_conc_disc_250m.dta, clear 
merge m:1 id_conc_disc250m using temp
keep if _m==3 
drop _m
bys id_conc: egen gentri_9000_250=max(gentri1990)
bys id_conc: egen gentri_0010_250=max(gentri2000)
bys id_conc: egen poor1990_250=max(poor1990)
bys id_conc: egen poor2000_250=max(poor2000)
bys id_conc poor1990: egen pop_tot1990=sum(pop1990)
gen sh1990=pop1990/pop_tot1990
replace sh1990=0 if sh1990==.
bys id_conc poor2000: egen pop_tot2000=sum(pop2000)
gen sh2000=pop2000/pop_tot2000
replace sh2000=0 if sh2000==.
drop pop_tot1990 pop_tot2000

foreach v in "pc_gr" "sh_edu_gr"{
bys id_conc: egen `v'1990_poor_250=sum(sh1990*`v'1990_poor)
bys id_conc: egen `v'2000_poor_250=sum(sh2000*`v'2000_poor)

bys id_conc: egen `v'1990_poor_250_max=max(`v'1990_poor)
replace `v'1990_poor_250_max=0 if  poor1990_250==0
bys id_conc: egen `v'2000_poor_250_max=max(`v'2000_poor)
replace `v'2000_poor_250_max=0 if  poor2000_250==0

bys id_conc: egen `v'1990_poor_250_med=median(`v'1990_poor)
replace `v'1990_poor_250_med=0 if poor1990_250==0

bys id_conc: egen `v'2000_poor_250_med=median(`v'2000_poor)
replace `v'2000_poor_250_med=0 if poor2000_250==0

replace `v'1990_poor_250=. if `v'1990_poor_250_max==.
replace `v'2000_poor_250=. if `v'2000_poor_250_max==.
drop `v'1990_poor_250_max `v'2000_poor_250_max
}

duplicates drop id_conc, force
keep id_conc poor1990_250 poor2000_250 gentri_9000_250 gentri_0010_250 pc_gr1990_poor_250* pc_gr2000_poor_250*  sh_edu_gr1990_poor_250* sh_edu_gr2000_poor_250* 
save gentri_250, replace

use id_conc_database, clear
keep id_conc t pop housing edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block median_yearb_block median_crent_block median_grent_block median_value_block percap_income_block emp* nbplants*
rename id_conc id_conc_disc250m 


joinby id_conc_disc250m using id_conc_disc_250m.dta

foreach v of var median_yearb_block median_crent_block median_grent_block median_value_block  {
bys id_conc t: egen housing_tot=sum(housing) if (`v'>0&`v'!=.)
gen sh=housing/housing_tot
replace `v'=sh*`v'
drop sh housing_tot
}

foreach v of var percap_income_block {
bys id_conc t: egen pop_tot=sum(pop) if `v'!=.
gen sh=pop/pop_tot
replace `v'=sh*`v'
drop sh pop_tot
}

collapse (sum) pop edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block percap_income_block housing median_yearb_block median_crent_block median_grent_block median_value_block emp* nbplants*, by(id_conc t)

rename edu_college_block edu 
rename pop_black_block black
rename pop_asian_block asian
rename pop_white_block white
rename pop_other_block other
rename percap_income_block pc_income
rename median_value_block housing_value
rename median_crent_block crent
rename median_grent_block grent
rename median_yearb med_yearb

foreach v of var pop edu black asian white other pc_income housing med_yearb crent grent housing_value emp* nbplants*{
rename `v' `v'250
}

foreach v of var pop* edu* black* asian* white* other* pc_income* housing* med_yearb* crent* grent* emp*{
g l`v' =log(`v') 
}

merge m:1 id_conc using gentri_250 // we merge gentri information at the neighborhood and id_conc level
drop _m

gen year=1990
replace year=2000 if t==2
replace year=2010 if t==3

gen gentri_init250= ((gentri_9000_250==1&year==1990)|(gentri_0010_250==1&year==2000))
label var gentri_init250 "Gentrifying id_conc in a 250m radius, beginning of the period"

gen poor_init250= ((poor1990_250==1&year==1990)|(poor2000_250==1&year==2000))
label var poor_init250 "Poor id_conc in a 250m radius, beginning of the period"

foreach n in "" "_med"{
gen pc_gr_poor_init250`n'=pc_gr1990_poor_250`n' if year==1990 
replace pc_gr_poor_init250`n'=pc_gr2000_poor_250`n' if year==2000

gen sh_edu_gr_poor_init250`n'=sh_edu_gr1990_poor_250`n' if year==1990 
replace sh_edu_gr_poor_init250`n'=sh_edu_gr2000_poor_250`n' if year==2000

}

save temp2, replace 

use id_conc_database, clear

rename edu_college_block edu 
rename pop_black_block black
rename pop_asian_block asian
rename pop_white_block white
rename pop_other_block other
rename percap_income_block pc_income
rename median_value_block housing_value
rename median_crent_block crent
rename median_grent_block grent
rename median_yearb med_yearb


keep id_conc t dis_ws pop edu black asian white other pc_income housing med_yearb crent grent housing_value emp* nbplants*

foreach v of var pop edu black asian white other pc_income housing med_yearb crent grent  emp{
g l`v' =log(`v') 
}

merge 1:1 id_conc t using temp2
keep if _m==3
drop _m

keep if dis_ws<=30 // we keep blocks within 30km from WS
tsset id_conc t
save id_conc_pop_income, replace 


// create variables of demographic changes within 500m rings 

use poor_gr.dta, clear
rename id_conc id_conc_disc500m
save temp, replace 

use id_conc_disc_500m.dta, clear 
merge m:1 id_conc_disc500m using temp
keep if _m==3 
drop _m

bys id_conc: egen gentri_9000_500=max(gentri1990)
bys id_conc: egen gentri_0010_500=max(gentri2000)
bys id_conc: egen poor1990_500=max(poor1990)
bys id_conc: egen poor2000_500=max(poor2000)

bys id_conc poor1990: egen pop_tot1990=sum(pop1990)
gen sh1990=pop1990/pop_tot1990
replace sh1990=0 if sh1990==.

bys id_conc poor2000: egen pop_tot2000=sum(pop2000)
gen sh2000=pop2000/pop_tot2000
replace sh2000=0 if sh2000==.

drop pop_tot1990 pop_tot2000

foreach v in "pc_gr" "sh_edu_gr"{
bys id_conc: egen `v'1990_poor_500=sum(sh1990*`v'1990_poor)
bys id_conc: egen `v'2000_poor_500=sum(sh2000*`v'2000_poor)

bys id_conc: egen `v'1990_poor_500_max=max(`v'1990_poor)
replace `v'1990_poor_500_max=0 if  poor1990_500==0
bys id_conc: egen `v'2000_poor_500_max=max(`v'2000_poor)
replace `v'2000_poor_500_max=0 if  poor2000_500==0

bys id_conc: egen `v'1990_poor_500_med=median(`v'1990_poor)
replace `v'1990_poor_500_med=0 if poor1990_500==0
bys id_conc: egen `v'2000_poor_500_med=median(`v'2000_poor)
replace `v'2000_poor_500_med=0 if poor2000_500==0

replace `v'1990_poor_500=. if `v'1990_poor_500_max==.
replace `v'2000_poor_500=. if `v'2000_poor_500_max==.
drop `v'1990_poor_500_max `v'2000_poor_500_max
}

duplicates drop id_conc, force

keep id_conc poor1990_500 poor2000_500  gentri_9000_500 gentri_0010_500 pc_gr1990_poor_500* pc_gr2000_poor_500* sh_edu_gr1990_poor_500* sh_edu_gr2000_poor_500* 
saveold gentri_500, v(13) replace

use id_conc_database, clear
keep id_conc t pop housing edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block median_yearb_block median_crent_block median_grent_block median_value_block percap_income_block emp* nbplants*
rename id_conc id_conc_disc500m 


joinby id_conc_disc500m using id_conc_disc_500m.dta

foreach v of var median_yearb_block median_crent_block median_grent_block median_value_block  {
bys id_conc t: egen housing_tot=sum(housing) if (`v'>0&`v'!=.)
gen sh=housing/housing_tot
replace `v'=sh*`v'
drop sh housing_tot
}

foreach v of var percap_income_block {
bys id_conc t: egen pop_tot=sum(pop) if `v'!=.
gen sh=pop/pop_tot
replace `v'=sh*`v'
drop sh pop_tot
}

collapse (sum) pop edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block percap_income_block housing median_yearb_block median_crent_block median_grent_block median_value_block emp* nbplants*, by(id_conc t)

rename edu_college_block edu 
rename pop_black_block black
rename pop_asian_block asian
rename pop_white_block white
rename pop_other_block other
rename percap_income_block pc_income
rename median_value_block housing_value
rename median_crent_block crent
rename median_grent_block grent
rename median_yearb med_yearb

foreach v of var pop edu black asian white other pc_income housing med_yearb crent grent housing_value emp* nbplants*{
rename `v' `v'500
}

foreach v of var pop* edu* black* asian* white* other* pc_income* housing* med_yearb* crent* grent*  emp*{
g l`v' =log(`v') 
}

merge m:1 id_conc using gentri_500
drop _m

gen year=1990
replace year=2000 if t==2
replace year=2010 if t==3

gen gentri_init500= ((gentri_9000_500==1&year==1990)|(gentri_0010_500==1&year==2000))
label var gentri_init500 "Gentrifying id_conc in a 500m radius, beginning of the period"

gen poor_init500= ((poor1990_500==1&year==1990)|(poor2000_500==1&year==2000))
label var poor_init500 "Poor id_conc in a 500m radius, beginning of the period"

foreach n in "" "_med"{
gen pc_gr_poor_init500`n'=pc_gr1990_poor_500`n' if year==1990 
replace pc_gr_poor_init500`n'=pc_gr2000_poor_500`n' if year==2000

gen sh_edu_gr_poor_init500`n'=sh_edu_gr1990_poor_500`n' if year==1990 
replace sh_edu_gr_poor_init500`n'=sh_edu_gr2000_poor_500`n' if year==2000

}
save temp2, replace 

use id_conc_pop_income, clear
merge 1:1 id_conc t using temp2
drop if _m==2
drop _m
save id_conc_pop_income, replace 

// create variables of demographic changes within contiguous blocks 

use poor_gr.dta, clear
rename id_conc id_conc_neighbors
save temp, replace

use id_conc_neighbors_mapping.dta, clear 
merge m:1 id_conc_neighbors using temp
keep if _m==3 
drop _m

bys id_conc: egen gentri_9000_cont=max(gentri1990)
bys id_conc: egen gentri_0010_cont=max(gentri2000)
bys id_conc: egen poor1990_cont=max(poor1990)
bys id_conc: egen poor2000_cont=max(poor2000)
bys id_conc poor1990: egen pop_tot1990=sum(pop1990)
gen sh1990=pop1990/pop_tot1990
replace sh1990=0 if sh1990==.
bys id_conc poor2000: egen pop_tot2000=sum(pop2000)
gen sh2000=pop2000/pop_tot2000
replace sh2000=0 if sh2000==.
drop pop_tot1990 pop_tot2000

foreach v in "pc_gr" "sh_edu_gr"{
bys id_conc: egen `v'1990_poor_cont=sum(sh1990*`v'1990_poor)
bys id_conc: egen `v'2000_poor_cont=sum(sh2000*`v'2000_poor)

bys id_conc: egen `v'1990_poor_cont_max=max(`v'1990_poor)
replace `v'1990_poor_cont_max=0 if  poor1990_cont==0
bys id_conc: egen `v'2000_poor_cont_max=max(`v'2000_poor)
replace `v'2000_poor_cont_max=0 if  poor2000_cont==0

bys id_conc: egen `v'1990_poor_cont_med=median(`v'1990_poor)
replace `v'1990_poor_cont_med=0 if poor1990_cont==0
bys id_conc: egen `v'2000_poor_cont_med=median(`v'2000_poor)
replace `v'2000_poor_cont_med=0 if poor2000_cont==0

replace `v'1990_poor_cont=. if `v'1990_poor_cont_max==.
replace `v'2000_poor_cont=. if `v'2000_poor_cont_max==.
drop `v'1990_poor_cont_max `v'2000_poor_cont_max
}

duplicates drop id_conc, force

keep id_conc poor1990_cont poor2000_cont gentri_9000_cont gentri_0010_cont pc_gr1990_poor_cont* pc_gr2000_poor_cont*  sh_edu_gr1990_poor_cont*  sh_edu_gr2000_poor_cont* 
save gentri_cont, replace

use id_conc_database, clear
keep id_conc t pop housing edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block median_yearb_block median_crent_block median_grent_block median_value_block percap_income_block emp* nbplants*
rename id_conc id_conc_neighbors

joinby id_conc_neighbors using id_conc_neighbors_mapping.dta

foreach v of var median_yearb_block median_crent_block median_grent_block median_value_block  {
bys id_conc t: egen housing_tot=sum(housing) if (`v'>0&`v'!=.)
gen sh=housing/housing_tot
replace `v'=sh*`v'
drop sh housing_tot
}

foreach v of var percap_income_block {
bys id_conc t: egen pop_tot=sum(pop) if `v'!=.
gen sh=pop/pop_tot
replace `v'=sh*`v'
drop sh pop_tot
}

collapse (sum) pop edu_college_block pop_black_block pop_asian_block pop_white_block pop_other_block percap_income_block housing median_yearb_block median_crent_block median_grent_block median_value_block emp* nbplants*, by(id_conc t)

rename edu_college_block edu 
rename pop_black_block black
rename pop_asian_block asian
rename pop_white_block white
rename pop_other_block other
rename percap_income_block pc_income
rename median_value_block housing_value
rename median_crent_block crent
rename median_grent_block grent
rename median_yearb med_yearb

foreach v of var pop edu black asian white other pc_income housing med_yearb crent grent housing_value emp* nbplants*{
rename `v' `v'cont
}

foreach v of var pop* edu* black* asian* white* other* pc_income* housing* med_yearb* crent* grent* emp*{
g l`v' =log(`v') 
}

merge m:1 id_conc using gentri_cont
drop _m

gen year=1990
replace year=2000 if t==2
replace year=2010 if t==3

gen gentri_initcont= ((gentri_9000_cont==1&year==1990)|(gentri_0010_cont==1&year==2000))
label var gentri_initcont "Gentrifying id_conc in a contiguous block, beginning of the period"

gen poor_initcont= ((poor1990_cont==1&year==1990)|(poor2000_cont==1&year==2000))
label var poor_initcont "Poor id_conc in a contiguous block, beginning of the period"

foreach n in "" "_med"{
gen pc_gr_poor_initcont`n'=pc_gr1990_poor_cont`n' if year==1990 
replace pc_gr_poor_initcont`n'=pc_gr2000_poor_cont`n' if year==2000

gen sh_edu_gr_poor_initcont`n'=sh_edu_gr1990_poor_cont`n' if year==1990 
replace sh_edu_gr_poor_initcont`n'=sh_edu_gr2000_poor_cont`n' if year==2000

}

save temp2, replace 

use id_conc_pop_income, clear
merge 1:1 id_conc t using temp2
drop if _m==2
drop _m

tsset id_conc t

foreach n in "250" "500" "cont"{
gen sh_edu`n'=edu`n'/pop`n'
}

foreach v of var lpc_income250 lpc_income500 lpc_incomecont sh_edu250 sh_edu500 sh_educont{
gen d`v'=f.`v'-`v'
}

save id_conc_pop_income, replace 

erase "temp.dta"
erase "temp2.dta"
erase "gentri_250.dta"
erase "gentri_500.dta"
erase "gentri_cont.dta"


// Add establishment data

use naics02_NETSlabel.dta, clear
gen naics=substr(naics02,1,5)
bys naics: egen var=count(naics)
tab var
keep if var==1
keep naics naics02
save naics5, replace

use naics02_NETSlabel.dta, clear
gen naics=substr(naics02,1,4)
bys naics: egen var=count(naics)
tab var
keep if var==1
keep naics naics02
save naics4, replace


use duns_1990_2010_idconc_naics.dta, clear 
* first, we check of the way naics is filled in and make corrections when necessary
tostring naics, replace
count if length(naics)!=6
count if length(naics)!=6&naics!="."

merge m:1 naics using naics5
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
keep if toto==6
drop toto

egen ind=group(naics)
sum ind


keep if year==1990|year==2000|year==2010

gen nb_plants_sect=1
gen emp_sect=emp

gen t=1 if year==1990
replace t=2 if year==2000
replace t=3 if year==2010

collapse (sum) nb_plants_sect emp_sect, by(id_conc naics t year)

merge m:1 id_conc t using id_conc_pop_income 
keep if _m==3
drop _m

keep year id_conc naics t nb_plants_sect emp_sect
egen sect=group(naics)

egen _=count(year), by(naics year) 
keep if _>99 // at least 100 blocks 
drop _
saveold naics_id_conc_database, replace v(13)





